{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import sys\n",
    "\n",
    "# Make the parent of the current working directory importable.\n",
    "# The membership check keeps the cell idempotent: re-running it does not\n",
    "# pile up duplicate entries in sys.path.\n",
    "project_root = os.path.dirname(os.path.abspath('.'))\n",
    "if project_root not in sys.path:\n",
    "    sys.path.append(project_root)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 数据准备"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "\n",
    "# Rating matrix: one row per user, one column per item.\n",
    "# Later cells treat a rating of 0 as an item the user has not rated.\n",
    "data = np.array([\n",
    "    [2.5, 3.5, 3.0, 3.5, 2.5, 3.0],\n",
    "    [3.0, 3.5, 1.5, 5.0, 3.5, 3.0],\n",
    "    [2.5, 3.0, 0.0, 3.5, 0.0, 4.0],\n",
    "    [0.0, 3.5, 3.0, 0.0, 4.0, 4.0],\n",
    "    [3.0, 4.0, 2.0, 3.0, 2.0, 3.0],\n",
    "    [3.0, 4.0, 0.0, 5.0, 3.5, 3.0],\n",
    "    [0.0, 4.5, 0.0, 4.0, 1.0, 0.0],\n",
    "])\n",
    "n_users = data.shape[0]\n",
    "n_items = data.shape[1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "from metrics.pairwise.euclidean_distances import euclidean_distances\n",
    "\n",
    "# Transposing the user-by-item matrix puts one item per row, so the\n",
    "# distances below are taken between pairs of items.\n",
    "item_vectors = data.T\n",
    "dist_mat = euclidean_distances(item_vectors)\n",
    "\n",
    "# Turn each distance d into a similarity 1 / (1 + d).\n",
    "sim_mat = 1 / (1 + dist_mat)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "因为需要同时存储物品索引与相似度，这里使用字典结构来存储。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{0: [(5, 0.1886378647726465), (4, 0.16736577623297264), (3, 0.1639607805437114), (2, 0.15954492995986427), (1, 0.14285714285714285)], 1: [(3, 0.1951941016011038), (5, 0.16952084719853724), (4, 0.16202097927744855), (0, 0.14285714285714285), (2, 0.12002728245132872)], 2: [(4, 0.1886378647726465), (0, 0.15954492995986427), (5, 0.1560469703786189), (1, 0.12002728245132872), (3, 0.1030561550871519)], 3: [(1, 0.1951941016011038), (0, 0.1639607805437114), (5, 0.13579648178933995), (4, 0.13133048602716904), (2, 0.1030561550871519)], 4: [(2, 0.1886378647726465), (5, 0.1876127897984334), (0, 0.16736577623297264), (1, 0.16202097927744855), (3, 0.13133048602716904)], 5: [(0, 0.1886378647726465), (4, 0.1876127897984334), (1, 0.16952084719853724), (2, 0.1560469703786189), (3, 0.13579648178933995)]}\n"
     ]
    }
   ],
   "source": [
    "# For every item, order all items by decreasing similarity.\n",
    "# mergesort is stable, so tied items keep their index order on every run.\n",
    "order = np.argsort(-sim_mat, axis=1, kind='mergesort')\n",
    "\n",
    "# Drop each item from its own neighbour list by index, not by position:\n",
    "# an item whose ratings are identical to another's ties with it at the top,\n",
    "# so the item itself is not guaranteed to sit in column 0.\n",
    "row_ids = np.arange(n_items)[:, None]\n",
    "not_self = order != row_ids\n",
    "\n",
    "# Two matrices holding the neighbours' indices and their similarities\n",
    "sim_idx_mat = order[not_self].reshape(n_items, n_items - 1)\n",
    "sim_val_mat = sim_mat[row_ids, sim_idx_mat]\n",
    "\n",
    "# item index -> [(neighbour index, similarity), ...], most similar first\n",
    "sim_dict = {\n",
    "    row: list(zip(sim_idx_mat[row], sim_val_mat[row]))\n",
    "    for row in range(n_items)\n",
    "}\n",
    "\n",
    "print(sim_dict)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "现在仍需要对最后一个用户做推荐。这次不再需要去原数据中找出相似用户的数据，而是根据该用户已评分的物品，去找到与之最相似的候选物品。首先提取该用户已有的历史数据，以及该用户未曾接触过的物品(候选推荐)："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[1 3 4] [0 2 5]\n"
     ]
    }
   ],
   "source": [
    "i = 6    # index of the last user\n",
    "\n",
    "item_ids = np.arange(n_items)\n",
    "unrated = data[i] == 0\n",
    "\n",
    "seen_items = item_ids[~unrated]    # items this user has already rated\n",
    "cand_items = item_ids[unrated]     # unrated items: the recommendation candidates\n",
    "\n",
    "print(seen_items, cand_items)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Set membership is O(1); `in` on a NumPy array scans every element.\n",
    "cand_set = set(cand_items.tolist())\n",
    "\n",
    "scores = dict()     # candidate item -> sum of similarity * rating\n",
    "tol_sim = dict()    # candidate item -> sum of similarities (the denominator)\n",
    "\n",
    "# Walk over the items this user has already rated\n",
    "for idx, val in enumerate(data[i]):\n",
    "    if idx in cand_set:\n",
    "        continue\n",
    "\n",
    "    # Walk over that item's neighbours, keeping only candidate items\n",
    "    for sim_item, sim_val in sim_dict[idx]:\n",
    "        if sim_item not in cand_set:\n",
    "            continue\n",
    "        scores[sim_item] = scores.get(sim_item, 0) + sim_val * val\n",
    "        tol_sim[sim_item] = tol_sim.get(sim_item, 0) + sim_val\n",
    "\n",
    "# Predicted rating = similarity-weighted mean of the user's own ratings.\n",
    "# Sort best first so the list is an actual ranking, not insertion order.\n",
    "ranking = sorted(\n",
    "    ((item, score / tol_sim[item]) for item, score in scores.items()),\n",
    "    key=lambda pair: pair[1],\n",
    "    reverse=True,\n",
    ")\n",
    "ranking"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "hide_input": false,
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.6"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
